pacman::p_load(ggplot2, dplyr, RMySQL, lubridate, psych, tidyr, plotly, padr, imputeTS, fpp2)
## also installing the dependency 'fma'
## 
## The downloaded binary packages are in
##  /var/folders/cj/pkfd827d2hb4v282d82v3cch0000gn/T//RtmpnSqV0G/downloaded_packages
## 
## fpp2 installed
#remove scientific notation in r
options(scipen=999)

## Create a database connection 
con = dbConnect(MySQL(), user='deepAnalytics', password='Sqltask1234!', dbname='dataanalytics2018', host='data-analytics-2018.cbrosir2cswx.us-east-1.rds.amazonaws.com')

#Query 
yr_2006 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1, Sub_metering_2, Sub_metering_3 FROM yr_2006")
yr_2007 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1, Sub_metering_2, Sub_metering_3 FROM yr_2007")
yr_2008 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1, Sub_metering_2, Sub_metering_3 FROM yr_2008")
yr_2009 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1, Sub_metering_2, Sub_metering_3 FROM yr_2009")
yr_2010 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1, Sub_metering_2, Sub_metering_3 FROM yr_2010")

#Combine tables into one dataframe
All_Years <- bind_rows(yr_2007, yr_2008, yr_2009, yr_2010)



#PREPROCESSING

## Combine Date and Time attribute values in a new attribute column with Paste
All_Years <-cbind(All_Years,paste(All_Years$Date,All_Years$Time), stringsAsFactors=FALSE)

## Give the new attribute in the 6th column a header name change the name
colnames(All_Years)[6] <-"DateTime"

## And move the DateTime attribute within the dataset
All_Years <- All_Years[,c(ncol(All_Years), 1:(ncol(All_Years)-1))]

## Convert DateTime from POSIXlt to POSIXct 
All_Years$DateTime <- as.POSIXct(All_Years$DateTime, "%Y/%m/%d %H:%M:%S")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown
## timezone '%Y/%m/%d %H:%M:%S'
## Add the time zone
attr(All_Years$DateTime, "tzone") <- "GMT+0"

#Separate daytime in different attributes
All_Years$year <- year(All_Years$DateTime)
All_Years$month <- month(All_Years$DateTime)
All_Years$weekday <- weekdays(All_Years$DateTime)
All_Years$day <- day(All_Years$DateTime)
All_Years$hour <- hour(All_Years$DateTime)
All_Years$minute <- minute(All_Years$DateTime)

#Rename Sub_meterings columns
names(All_Years)[names(All_Years) == "Sub_metering_1"] <- "Kitchen"
names(All_Years)[names(All_Years) == "Sub_metering_2"] <- "Laundry"
names(All_Years)[names(All_Years) == "Sub_metering_3"] <- "AC_Heater"


 #Gather all sub_meterings in two attributes (Submetering - Value)
All_Years_Comb_Submeterings <- All_Years %>% gather(Sub_metering, Value, Kitchen:AC_Heater)
#EXPLORATION OF THE DATA

#How many different values we have for the total sub_meterings?
Values_n <-  All_Years_Comb_Submeterings %>% group_by(Value) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
 
#There are 88 different values. 

#Can we convert those values as Factors?
All_Years_Comb_Submeterings$Value = as.factor(All_Years_Comb_Submeterings$Value)

#Visualize the distribution of the Values
Values_n_plot <- ggplot(data=Values_n) + geom_col(mapping = aes(x=Value, y=n))
Values_n_plot %>%  ggplotly()
#If we group the values we will find a normal distribution for each group: 
#[0, 1, 2], [17, 18, 19], [36, 37,38, 39]
#[11,12,13], [28,29,30]

#That means that we have settled values and not much oscillation.
#Now let's try to find out from which sub_metering those peaks belong.

#Histogram of Values for each sub_metering 
Plot_All_Years_Comb_Submeterings <-  ggplot(All_Years_Comb_Submeterings, aes(Value, fill = Sub_metering)) +  geom_bar() 

Plot_All_Years_Comb_Submeterings %>%  ggplotly()
#Histograms of the three submetertings
 hist(All_Years$Kitchen)

 hist(All_Years$Laundry)

 hist(All_Years$AC_Heater)

 #BoxPlots of the three submeterings
 boxplot(Kitchen~year,data=All_Years) 

 boxplot(Laundry~year,data=All_Years) 

 boxplot(AC_Heater~year,data=All_Years) 

 ggplot(All_Years_Comb_Submeterings, aes(Sub_metering, Value )) + geom_boxplot() + coord_flip() + facet_grid(.~year)

#Statistics
summary(All_Years) 
##     DateTime                       Date               Time          
##  Min.   :2007-01-01 00:00:00   Length:2027288     Length:2027288    
##  1st Qu.:2007-12-21 16:32:45   Class :character   Class :character  
##  Median :2008-12-07 16:38:30   Mode  :character   Mode  :character  
##  Mean   :2008-12-09 17:30:05                                        
##  3rd Qu.:2009-11-27 16:09:15                                        
##  Max.   :2010-11-26 21:02:00                                        
##     Kitchen          Laundry         AC_Heater           year     
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :2007  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.:2007  
##  Median : 0.000   Median : 0.000   Median : 1.000   Median :2008  
##  Mean   : 1.121   Mean   : 1.289   Mean   : 6.448   Mean   :2008  
##  3rd Qu.: 0.000   3rd Qu.: 1.000   3rd Qu.:17.000   3rd Qu.:2009  
##  Max.   :88.000   Max.   :80.000   Max.   :31.000   Max.   :2010  
##      month          weekday               day             hour     
##  Min.   : 1.000   Length:2027288     Min.   : 1.00   Min.   : 0.0  
##  1st Qu.: 3.000   Class :character   1st Qu.: 8.00   1st Qu.: 5.0  
##  Median : 6.000   Mode  :character   Median :16.00   Median :12.0  
##  Mean   : 6.394                      Mean   :15.62   Mean   :11.5  
##  3rd Qu.: 9.000                      3rd Qu.:23.00   3rd Qu.:18.0  
##  Max.   :12.000                      Max.   :31.00   Max.   :23.0  
##      minute    
##  Min.   : 0.0  
##  1st Qu.:15.0  
##  Median :30.0  
##  Mean   :29.5  
##  3rd Qu.:44.0  
##  Max.   :59.0
describe(All_Years) 
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
##           vars       n    mean    sd  min  max range   se
## DateTime     1 2027288     NaN    NA  Inf -Inf  -Inf   NA
## Date         2 2027288     NaN    NA  Inf -Inf  -Inf   NA
## Time         3 2027288     NaN    NA  Inf -Inf  -Inf   NA
## Kitchen      4 2027288    1.12  6.15    0   88    88 0.00
## Laundry      5 2027288    1.29  5.79    0   80    80 0.00
## AC_Heater    6 2027288    6.45  8.43    0   31    31 0.01
## year         7 2027288 2008.45  1.10 2007 2010     3 0.00
## month        8 2027288    6.39  3.39    1   12    11 0.00
## weekday      9 2027288     NaN    NA  Inf -Inf  -Inf   NA
## day         10 2027288   15.62  8.80    1   31    30 0.01
## hour        11 2027288   11.50  6.92    0   23    23 0.00
## minute      12 2027288   29.50 17.32    0   59    59 0.01
#We see that we have a lot of 0's. Which is the influence of the low values in the total? In terms of mean? 
 
#REEEMOVEE
#What if i remove those low values?
All_Years_Greater_2 <- All_Years %>% filter(Kitchen>2, Laundry>2, AC_Heater>2)
summary(All_Years_Greater_2) 
##     DateTime                       Date               Time          
##  Min.   :2007-01-07 13:15:00   Length:4517        Length:4517       
##  1st Qu.:2007-08-08 11:01:00   Class :character   Class :character  
##  Median :2008-05-06 15:46:00   Mode  :character   Mode  :character  
##  Mean   :2008-07-30 20:36:25                                        
##  3rd Qu.:2009-05-16 14:37:00                                        
##  Max.   :2010-11-22 10:19:00                                        
##     Kitchen         Laundry        AC_Heater         year     
##  Min.   : 3.00   Min.   : 3.00   Min.   : 3.0   Min.   :2007  
##  1st Qu.:32.00   1st Qu.:15.00   1st Qu.:17.0   1st Qu.:2007  
##  Median :37.00   Median :31.00   Median :17.0   Median :2008  
##  Mean   :32.73   Mean   :27.69   Mean   :17.3   Mean   :2008  
##  3rd Qu.:38.00   3rd Qu.:36.00   3rd Qu.:18.0   3rd Qu.:2009  
##  Max.   :81.00   Max.   :76.00   Max.   :30.0   Max.   :2010  
##      month          weekday               day             hour      
##  Min.   : 1.000   Length:4517        Min.   : 1.00   Min.   : 0.00  
##  1st Qu.: 2.000   Class :character   1st Qu.: 8.00   1st Qu.:12.00  
##  Median : 5.000   Mode  :character   Median :16.00   Median :14.00  
##  Mean   : 5.648                      Mean   :15.67   Mean   :15.05  
##  3rd Qu.:10.000                      3rd Qu.:23.00   3rd Qu.:19.00  
##  Max.   :12.000                      Max.   :31.00   Max.   :23.00  
##      minute     
##  Min.   : 0.00  
##  1st Qu.:15.00  
##  Median :30.00  
##  Mean   :29.71  
##  3rd Qu.:44.00  
##  Max.   :59.00
describe(All_Years_Greater_2) 
## Warning in describe(All_Years_Greater_2): NAs introduced by coercion
## Warning in describe(All_Years_Greater_2): NAs introduced by coercion

## Warning in describe(All_Years_Greater_2): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
##           vars    n    mean    sd median trimmed   mad  min  max range
## DateTime     1 4517     NaN    NA     NA     NaN    NA  Inf -Inf  -Inf
## Date*        2 4517     NaN    NA     NA     NaN    NA  Inf -Inf  -Inf
## Time*        3 4517     NaN    NA     NA     NaN    NA  Inf -Inf  -Inf
## Kitchen      4 4517   32.73 10.73     37   34.27  1.48    3   81    78
## Laundry      5 4517   27.69 17.24     31   26.53 10.38    3   76    73
## AC_Heater    6 4517   17.30  1.97     17   17.26  1.48    3   30    27
## year         7 4517 2008.15  1.07   2008 2008.06  1.48 2007 2010     3
## month        8 4517    5.65  3.82      5    5.45  4.45    1   12    11
## weekday*     9 4517     NaN    NA     NA     NaN    NA  Inf -Inf  -Inf
## day         10 4517   15.67  8.46     16   15.62 10.38    1   31    30
## hour        11 4517   15.05  4.47     14   15.19  4.45    0   23    23
## minute      12 4517   29.71 16.84     30   29.77 20.76    0   59    59
##            skew kurtosis   se
## DateTime     NA       NA   NA
## Date*        NA       NA   NA
## Time*        NA       NA   NA
## Kitchen   -0.70     2.72 0.16
## Laundry    0.49     0.40 0.26
## AC_Heater  1.01    21.30 0.03
## year       0.41    -1.14 0.02
## month      0.36    -1.30 0.06
## weekday*     NA       NA   NA
## day        0.00    -1.13 0.13
## hour      -0.39     0.14 0.07
## minute    -0.04    -1.14 0.25
#Are there any NAs?
 summary(is.na(All_Years))
##   DateTime          Date            Time          Kitchen       
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:2027288   FALSE:2027288   FALSE:2027288   FALSE:2027288  
##   Laundry        AC_Heater          year           month        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:2027288   FALSE:2027288   FALSE:2027288   FALSE:2027288  
##   weekday           day             hour           minute       
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:2027288   FALSE:2027288   FALSE:2027288   FALSE:2027288
#EXPLORATION OF THE DATA. PLOTS
 
# Sum
 
All_Years %>% group_by(year, hour) %>% summarise(Total_Kitchen=sum(Kitchen), Total_Laundry=sum(Laundry), Total_AC_Heater=sum(AC_Heater)) %>% ggplot(aes(x = hour)) +
 geom_line(aes(y = Total_Kitchen, col = "Total_Kitchen")) + geom_point((aes(y= Total_Kitchen, col = "Total_Kitchen"))) +
 geom_line(aes(y = Total_Laundry, col = "Total_Laundry")) + geom_point((aes(y= Total_Laundry, col = "Total_Laundry"))) +
 geom_line(aes(y = Total_AC_Heater, col = "Total_AC_Heater")) +
 geom_point((aes(y= Total_AC_Heater, col = "Total_AC_Heater"))) +
 scale_x_continuous(breaks = c(1:24), labels = c(1:24), limits = c(1,24)) +
 ylab("Energy consumtion") + facet_wrap(~year, scales = "free_x")
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).

#Mean
All_years_mean_by_year_hour <- All_Years %>% group_by(year, hour) %>% summarise(Mean_Kitchen=mean(Kitchen), Mean_Laundry=mean(Laundry), Mean_AC_Heater=mean(AC_Heater)) %>% ggplot(aes(x = hour)) +
 geom_line(aes(y = Mean_Kitchen, col = "Mean_Kitchen")) + geom_point((aes(y= Mean_Kitchen, col = "Mean_Kitchen"))) +
 geom_line(aes(y = Mean_Laundry, col = "Mean_Laundry")) + geom_point((aes(y= Mean_Laundry, col = "Mean_Laundry"))) +
 geom_line(aes(y = Mean_AC_Heater, col = "Mean_AC_Heater")) +
 geom_point((aes(y= Mean_AC_Heater, col = "Mean_AC_Heater"))) +
 scale_x_continuous(breaks = c(1:24), labels = c(1:24), limits = c(1,24)) +
 ylab("Energy consumtion") + facet_wrap(~year, scales = "free_x")
 


  
 
  
 #Plot the mean of submeterings by year. Exclude watts below 2
 All_Years %>% group_by(year, hour) %>% filter(Kitchen>2, Laundry>2, AC_Heater>2) %>% summarise(Mean_Kitchen=mean(Kitchen), Mean_Laundry=mean(Laundry), Mean_AC_Heater=mean(AC_Heater)) %>% ggplot( aes(x = hour)) +
 geom_line(aes(y = Mean_Kitchen, col = "Mean_Kitchen")) + geom_point((aes(y= Mean_Kitchen, col = "Mean_Kitchen"))) +
 geom_line(aes(y = Mean_Laundry, col = "Mean_Laundry")) + geom_point((aes(y= Mean_Laundry, col = "Mean_Laundry"))) +
 geom_line(aes(y = Mean_AC_Heater, col = "Mean_AC_Heater")) +
 geom_point((aes(y= Mean_AC_Heater, col = "Mean_AC_Heater"))) +
 scale_x_continuous(breaks = c(1:24), labels = c(1:24), limits = c(1,24)) +
 ylab("Energy consumtion") + facet_wrap(~year, scales = "free_x")
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_point).

 #?
 All_Years_Comb_Submeterings %>% group_by(day) %>%ggplot(aes(x=day,y=Value)) +  geom_col() 

 #MISSING ROWS
 
 All_Years_PAD <- All_Years
 All_Years_PAD$DateTime %>% get_interval()
## [1] "min"
   #missing rows by week = 0 and by month = 0 
 
 All_Years_PAD_month <- All_Years_PAD  %>% select(DateTime, Kitchen) %>% thicken('month') %>% group_by(DateTime_month) %>%  summarise(Kitchen_No = sum(Kitchen)) %>% pad()
## pad applied on the interval: month
 summary(is.na(All_Years_PAD_month)) #but we know that there is a missing month, which is december 2010
##  DateTime_month  Kitchen_No     
##  Mode :logical   Mode :logical  
##  FALSE:47        FALSE:47
  #missing rows by day = 9
 
 All_Years_PAD_day <- All_Years_PAD  %>% select(DateTime, Kitchen) %>% thicken('day') %>% group_by(DateTime_day) %>%  summarise(Kitchen_No = sum(Kitchen)) %>% pad()
## pad applied on the interval: day
 summary(is.na(All_Years_PAD_day))
##  DateTime_day    Kitchen_No     
##  Mode :logical   Mode :logical  
##  FALSE:1426      FALSE:1417     
##                  TRUE :9
  #missing rows by hour = 421
 All_Years_PAD_hour <- All_Years_PAD  %>% select(DateTime, Kitchen) %>% thicken('hour') %>% group_by(DateTime_hour) %>%  summarise(Kitchen_No = sum(Kitchen)) %>% pad()
## pad applied on the interval: hour
 summary(is.na(All_Years_PAD_hour))
##  DateTime_hour   Kitchen_No     
##  Mode :logical   Mode :logical  
##  FALSE:34222     FALSE:33801    
##                  TRUE :421
#missing rows by minute = 25975
     All_Years_PAD_min <- All_Years_PAD %>% pad(by="DateTime", break_above = 3)
## pad applied on the interval: min
     summary(is.na(All_Years_PAD_min))
##   DateTime          Date            Time          Kitchen       
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:2053263   FALSE:2027288   FALSE:2027288   FALSE:2027288  
##                  TRUE :25975     TRUE :25975     TRUE :25975    
##   Laundry        AC_Heater          year           month        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:2027288   FALSE:2027288   FALSE:2027288   FALSE:2027288  
##  TRUE :25975     TRUE :25975     TRUE :25975     TRUE :25975    
##   weekday           day             hour           minute       
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:2027288   FALSE:2027288   FALSE:2027288   FALSE:2027288  
##  TRUE :25975     TRUE :25975     TRUE :25975     TRUE :25975
#Plot missing values with imputeTS package
    #min
     as.ts(All_Years_PAD_min$Kitchen) %>% plotNA.distribution()

    #hours
     as.ts(All_Years_PAD_hour$Kitchen_No)  %>% plotNA.distribution()

     #days
     as.ts(All_Years_PAD_day$Kitchen_No)  %>% plotNA.distribution()

     #All_Years_PAD %>% select(DateTime, Kitchen) %>% thicken('hour') %>% group_by(DateTime_hour)  %>% pad(by="DateTime") 
      
       
        plotNA.distribution(as.ts(All_Years_PAD_hour$Kitchen_No))

       #All_Years_PAD_hour_2 <- All_Years_PAD %>% thicken('hour') %>% pad(by="DateTime_hour", break_above = 3)
       
       # summary(is.na(All_Years_PAD_hour_2))
       # 
       # All_Years_PAD_hour_2 %>% group_by(year) %>% plotNA.distribution()
       # 
       # plotNA.distribution(All_Years_PAD_hour_2$Kitchen)
       # 
       as.ts(All_Years_PAD_hour$Kitchen_No)  %>% plotNA.distribution( cexPoints = 2)

#Visualizing a single day (09/01/2008)
      
houseDay <- filter(All_Years, year == 2008 & month == 1 & day == 9)

## Plot the kitchen
plot_ly(houseDay, x = ~houseDay$DateTime, y = ~houseDay$Kitchen, type = 'scatter', mode = 'lines')
## Plot sub-meter 1, 2 and 3 with title, legend and labels - All observations
plot_ly(houseDay, x = ~houseDay$DateTime, y = ~houseDay$Kitchen, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseDay$Laundry, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseDay$AC_Heater, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption January 9th, 2008",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)")) 
#Reducing granularity

## Subset the 9th day of January 2008 - 10 Minute frequency
houseDay10min <- filter(All_Years, year == 2008 & month == 1 & day == 9 & (minute == 0 | minute == 10 | minute == 20 | minute == 30 | minute == 40 | minute == 50))

## Plot sub-meter 1, 2 and 3 with title, legend and labels - 10 Minute frequency
plot_ly(houseDay10min, x = ~houseDay10min$DateTime, y = ~houseDay10min$Kitchen, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseDay10min$Laundry, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseDay10min$AC_Heater, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption January 9th, 2008",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))